In [1]:
from planet4 import io
from planet4.region_data import Inca

read the fast_read hdf


In [2]:
uncleaned = "/Users/klay6683/local_data/2016-11-21_planet_four_classifications_queryable.h5"

In [4]:
data = pd.read_hdf(uncleaned, 'df')

In [5]:
c_id = '50ef419195e6e40eac000001'

In [6]:
sub = data[data.classification_id==c_id]
sub.shape


Out[6]:
(1, 21)

In [7]:
sub


Out[7]:
classification_id created_at image_id image_name image_url user_name marking x_tile y_tile acquisition_date ... x y image_x image_y radius_1 radius_2 distance angle spread version
11285854 50ef419195e6e40eac000001 2013-01-10 21:32:10 APF00004y6 ESP_012053_0980 http://www.planetfour.org/subjects/standard/50... Kyle Butcher none 5 14 2009-02-20 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 21 columns


In [8]:
c_id = '50ef41ea95e6e42e89000001'
sub2 = data[data.classification_id==c_id]
sub2.shape


Out[8]:
(1, 21)

In [9]:
sub2


Out[9]:
classification_id created_at image_id image_name image_url user_name marking x_tile y_tile acquisition_date ... x y image_x image_y radius_1 radius_2 distance angle spread version
11285830 50ef41ea95e6e42e89000001 2013-01-10 21:32:08 APF00004y6 ESP_012053_0980 http://www.planetfour.org/subjects/standard/50... Kyle Butcher none 5 14 2009-02-20 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 21 columns


In [10]:
image_ids = data.image_id.unique()

In [11]:
df = data[data.image_id==image_ids[0]]

In [12]:
df = data[data.image_id=='APF00004y6']

In [13]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 141 entries, 5245377 to 12944648
Data columns (total 21 columns):
classification_id    141 non-null object
created_at           141 non-null datetime64[ns]
image_id             141 non-null object
image_name           141 non-null object
image_url            141 non-null object
user_name            141 non-null object
marking              141 non-null object
x_tile               141 non-null int64
y_tile               141 non-null int64
acquisition_date     141 non-null datetime64[ns]
local_mars_time      141 non-null object
x                    45 non-null float64
y                    45 non-null float64
image_x              45 non-null float64
image_y              45 non-null float64
radius_1             4 non-null float64
radius_2             4 non-null float64
distance             0 non-null float64
angle                4 non-null float64
spread               0 non-null float64
version              0 non-null float64
dtypes: datetime64[ns](2), float64(10), int64(2), object(7)
memory usage: 24.2+ KB

In [14]:
def show_duplicates(df):
    "df needs to be image_id dataframe."
    f = lambda x: len(x.classification_id.unique())
    return df.groupby(['user_name']).apply(f).sort_values(ascending=False)

def show_kyles_dupes(df):
    "works with all image_name data."
    return show_duplicates(df[df.image_id=='APF00004y6']).head()

In [15]:
show_kyles_dupes(data)


Out[15]:
user_name
Kyle Butcher    3
yuki330         1
farren272       1
bduddy          1
chris_vipond    1
dtype: int64

In [16]:
df = data[data.image_id.isin(['APF00004y6', 'APF00003hp'])]

In [17]:
show_kyles_dupes(df)


Out[17]:
user_name
Kyle Butcher    3
yuki330         1
farren272       1
bduddy          1
chris_vipond    1
dtype: int64

In [18]:
df2 = data[data.image_id=='APF00003hp']

In [19]:
show_duplicates(df2).head()


Out[19]:
user_name
Kitharode                                         2
not-logged-in-073dee28bbc9c250d9dc02cb99f4ef93    2
bmacpherson                                       2
GazyB                                             2
blaze_carter                                      1
dtype: int64

In [20]:
user2 = df2[df2.user_name=='Kitharode']

First we find the earliest timestamp, each classification_id has in principle its own timestamp, like Kitharode's data above. But for that not-logged-in user, it has two classification_ids with the same time-stamp:


In [21]:
user2 = df2[df2.user_name=='not-logged-in-073dee28bbc9c250d9dc02cb99f4ef93']

In [22]:
user2[user2.created_at==user2.created_at.min()].classification_id.unique()


Out[22]:
array(['50ef44b795e6e42cd2000001', '50ef44b995e6e42d8c000001'], dtype=object)

Note that above is filtered for data at minimum time!

but simply doing another minimum(), like i did in previous version of filtering for this, should always work


In [23]:
user2[user2.created_at==user2.created_at.min()].classification_id.min()


Out[23]:
'50ef44b795e6e42cd2000001'

For explanation: this is the earliest AND smallest classification_id for this image_id.


In [3]:
from ipyparallel import Client
c = Client()
lbview = c.load_balanced_view()
dview = c.direct_view()

In [5]:
def process_image_name(image_name):
    "processes data for image_name (=obsid)"
    from pandas import read_hdf
    data = read_hdf(dbname, 'df', where='image_name=image_name')
    
    c_ids= []
    def process_user_group(g):
        c_ids.append(g.loc[g.created_at==g.created_at.min(),'classification_id'].min())
    
    data.groupby(['image_id', 'user_name'], sort=False).apply(process_user_group)
    tmp = data.set_index('classification_id').loc[set(c_ids)].reset_index()
    return tmp

In [7]:
from planet4.reduction import get_image_names
image_names = get_image_names(uncleaned)


INFO: Reading image_names from disk.
INFO: Got image_names

In [8]:
len(image_names)


Out[8]:
432

In [9]:
todo = image_names

In [10]:
from nbtools import display_multi_progress

In [11]:
results = lbview.map_async(process_image_name, todo)

In [12]:
display_multi_progress(results, todo)

In [13]:
len(results)


Out[13]:
432

In [22]:
len(results.result())


Out[22]:
221

In [14]:
all_df = pd.concat(results, ignore_index=True)

In [15]:
all_df.shape


Out[15]:
(13515922, 21)

In [17]:
from pathlib import Path

In [19]:
p = Path(uncleaned)

In [23]:
newname = p.name[:-3]+ '_cleaned.h5'

In [24]:
data_columns = ['classification_id', 'image_id',
                'image_name', 'user_name', 'marking',
                'acquisition_date', 'local_mars_time']

In [25]:
all_df.to_hdf(p.parent / newname, 'df', format='table', data_columns=data_columns)

In [30]:
db.dbname


Out[30]:
'/Users/klay6683/Dropbox/data/planet4/2016-05-29_planet_four_classifications_queryable_cleaned_seasons2and3.h5'

In [ ]:


In [145]:
df = data[data.image_id=='APF00004y6']

In [146]:
df.image_name.unique()


Out[146]:
array(['ESP_012053_0980'], dtype=object)

In [133]:
show_kyles_dupes(process_image_name(df))


Out[133]:
user_name
yuki330         1
aquarius1979    1
bduddy          1
chris_vipond    1
daisy186        1
dtype: int64

In [134]:
len(process_image_name(df))


Out[134]:
139

In [135]:
%timeit process_image_name(df)


10 loops, best of 3: 53.9 ms per loop

In [ ]:


In [ ]:


In [ ]:


In [156]:
%timeit process_image_name('ESP_012053_0980')


1 loop, best of 3: 21.9 s per loop

In [157]:
len(image_names)


Out[157]:
432

In [151]:
image_names = data.image_name.unique()

In [ ]:


In [ ]:


In [ ]:
filtered_data = df.groupby(['user_name']).apply(process_user_group)

In [ ]:
from planet4 import helper_functions as hf

In [ ]:
def process_image_id(image_id):
    df = data[data.image_id==image_id]
    n_class = df.classification_id.unique().size
    results = hf.classification_counts_per_user(df).value_counts()
    if not any(results.index>1):
        return n_class, n_class
    else:
        n_class_real = results[1]
        for index in results.index[results.index>1]:
            n_class_real += results[index]
        return (n_class_real, n_class)

In [ ]:
real_class_percents = []
for image_id in image_ids:
    real_class_percents.append(process_image_id(image_id))

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
s = pd.DataFrame(real_class_percents, index=image_ids, columns=['real_n_class', 'expected_n_class'])

In [ ]:
s.head()

In [ ]:
s.plot()

In [ ]:
import seaborn as sns
sns.set_context('talk')

In [ ]:
s.head()

In [ ]:
s = s.assign(fraction=s.real_n_class/s.expected_n_class)

In [ ]:
s.fraction.plot(style='.',title='Inca City, season1, fraction of good classifications')

In [ ]:
s.describe()

In [ ]:
df = data[data.image_id=='APF0000zea']
n_class= df.classification_id.unique().size
n_class
results = hf.classification_counts_per_user(df).value_counts()
results
results.index>1
n_class_real = results[1]
for index in results.index[results.index>1]:
    n_class_real += results[index]
n_class_real

Update


In [31]:
subdata = data[data.image_id=='APF00003hp']

In [33]:
from planet4 import


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-33-00c099da19fa> in <module>()
----> 1 from planet4 import helper_functions as hf

ImportError: cannot import name 'helper_functions'

In [36]:
subdata.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 16915 to 12960194
Data columns (total 21 columns):
classification_id    573 non-null object
created_at           573 non-null datetime64[ns]
image_id             573 non-null object
image_name           573 non-null object
image_url            573 non-null object
user_name            573 non-null object
marking              573 non-null object
x_tile               573 non-null int64
y_tile               573 non-null int64
acquisition_date     573 non-null datetime64[ns]
local_mars_time      573 non-null object
x                    494 non-null float64
y                    494 non-null float64
image_x              494 non-null float64
image_y              494 non-null float64
radius_1             215 non-null float64
radius_2             215 non-null float64
distance             256 non-null float64
angle                471 non-null float64
spread               256 non-null float64
version              256 non-null float64
dtypes: datetime64[ns](2), float64(10), int64(2), object(7)
memory usage: 98.5+ KB

In [37]:
def pug1(g):
    c_id = g.sort_values(by='classification_id').classification_id.iloc[0]
    return g[g.classification_id == c_id]

In [38]:
def pug1b(g):
    c_id = g.sort_values(by='classification_id').classification_id.iloc[0]
    return c_id

In [39]:
def pug2(g):
    c_id = g.classification_id.min()
    return g[g.classification_id == c_id]

In [40]:
def pug2b(g):
    c_id = g.classification_id.min()
    return c_id

In [ ]:
usergroup = data.groupby(['user_name'], sort=False)

In [ ]:
%timeit usergroup.apply(pug1).reset_index(drop=True)

In [ ]:
%timeit usergroup.apply(pug2).reset_index(drop=True)

In [ ]:
%timeit data[data.classification_id.isin(usergroup.classification_id.min())]

In [ ]:
v1 = usergroup.apply(pug1).reset_index(drop=True).sort_values(by=['classification_id'])

In [ ]:
v2 = usergroup.apply(pug2).reset_index(drop=True).sort_values(by=['classification_id'])

In [ ]:
v3 = data[data.classification_id.isin(usergroup.classification_id.min())]

In [ ]:
(v1.dropna() == v3.dropna()).all()

In [ ]:
v3[v3.classification_id=='50ef44b995e6e42d8c000001']

In [ ]:
(usergroup.apply(pug1).dropna() == usergroup.apply(pug2).dropna()).all()

In [ ]:
v1.info()

In [ ]:
(v1.dropna()==v2.dropna()).all()

In [ ]:
v2.head()

In [ ]:
data.groupby(['user_name','classification_id']).apply(lambda x: len(x.classification_id.unique())).sort_values(ascending=False).min()

In [ ]:
v3 = data[data.classification_id.isin(data.groupby('user_name').classification_id.max())].sort_values(by='classification_id')

In [ ]:
(v1.classification_id.sort_values() == v2.classification_id.sort_values()).all()

In [ ]:
(g.apply(pug1).reset_index(drop=True) == g.apply(pug2).reset_index(drop=True)).all()

In [ ]:
%timeit g.apply(pug2b)

In [ ]:
img_ids = ['APF00003hp']
users = ['not-logged-in-073dee28bbc9c250d9dc02cb99f4ef93']

for img_id, user in zip(img_ids, users):
    print("image_id: ", img_id)
    print("User: ", user)
    data = db.get_image_id_markings(img_id)
    print("Before filtering classification_id created_at times:")
    print(data[data.user_name==user].created_at.unique())
    print("Classification_ids:")
    print(data[data.user_name==user].classification_id.unique())

    g = data.groupby(['user_name'])
    res = g.apply(process_user_group).reset_index(drop=True)
    print("After filtering:")
    print(res[res.user_name==user].created_at.unique())
    print(res[res.user_name==user].classification_id.unique())
    print()

In [ ]:
fname = '/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable.h5'
db = io.DBManager()

In [ ]:
db.dbname

In [ ]:
data = pd.read_hdf(db.dbname, 'df',
                    where="classification_id=='50ef44b795e6e42cd2000001'")
data

In [ ]:
df = pd.read_hdf(db.dbname, 'df')

In [ ]:
df[df.classification_id=='50ef44b795e6e42cd2000001']

In [ ]:
df[df.classification_id=='50ef44b995e6e42d8c000001']

In [ ]:
df[df.classification_id=='50ee0e5694b9d564a90000b5']

In [ ]:
db.dbname

In [ ]:
df.classification_id = df.classification_id.astype('str')

In [ ]:
df.to_hdf('/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable_cleaned.h5',
         'df', format='t', data_columns=reduction.data_columns)

In [ ]:
pd.read_hdf('/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable_cleaned.h5',
           'df', where="classification_id=='50ef44b795e6e42cd2000001'")

In [ ]:
fname = '/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable.h5'

In [ ]:
reduction.remove_duplicates_from_file(fname)

In [ ]:
data = pd.read_hdf('testing.h5', 'df',
                    where="classification_id=='50ef44b995e6e42d8c000001'")
data

In [ ]:
data2 = db.get_class_id_data('50ef44b995e6e42d8c000001')

In [ ]:
s = pd.Series(list('abc'))

In [ ]:
pd.DataFrame(s)

In [ ]:
from planet4 import io

In [ ]:
db=io.DBManager()

In [ ]:
import time

In [ ]:
imgnames = db.season2and3_image_names

In [ ]:
where = "image_name in {}".format(imgnames.values.tolist())

In [ ]:
where

In [ ]:
import time
t0 = time.time()
season23 = pd.read_hdf(db.dbname, 'df', where=where)
t1 = time.time()
print("time: ", t1 - t0)

pandas debug


In [30]:
cats = [ "s%07d" % i for i in range(4000000) ]

df = pd.DataFrame({'A' : cats})

In [31]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 1 columns):
A    object
dtypes: object(1)
memory usage: 30.5+ MB

In [32]:
In [3]: df['B'] = df['A'].astype('category')

In [4]: df.B.cat.codes.dtype


Out[32]:
dtype('int32')

In [33]:
for i in range(3):
    df.to_hdf('test_{}.h5'.format(i),'df',mode='w',data_columns=True,format='table')

In [36]:
df = []
for i in range(3):
    df.append(pd.read_hdf('test_{}.h5'.format(i), 'df'))

In [37]:
df = pd.concat(df, ignore_index=True)

In [38]:
df[df.B=='s0000005']


Out[38]:
A B
5 s0000005 s0000005
4000005 s0000005 s0000005
8000005 s0000005 s0000005

In [39]:
df[df.B=='s3999999']


Out[39]:
A B
3999999 s3999999 s3999999
7999999 s3999999 s3999999
11999999 s3999999 s3999999

In [40]:
df.to_hdf('test.h5', 'df', format='table', data_columns=True)

In [41]:
pd.read_hdf('test.h5','df',where='A="s3999999"')


Out[41]:
A B
3999999 s3999999 s3999999
7999999 s3999999 s3999999
11999999 s3999999 s3999999

In [43]:
pd.read_hdf('test.h5','df',where='B="s3999999"')


Out[43]:
A B
3999999 s3999999 s3999999
7999999 s3999999 s3999999
11999999 s3999999 s3999999